This is a tutorial to creating interactive visualizations of associations between metabolites and diseases.
The data used in this tutorial have been published in the following publication:
Julkunen, H., Cichońska, A., Tiainen, M. et al. Atlas of plasma NMR biomarkers for health and disease in 118,461 individuals from the UK Biobank. Nat Commun 14, 604 (2023). https://doi.org/10.1038/s41467-023-36231-7
The data have been downloaded from https://biomarker-atlas.nightingale.cloud/ under the tab Summary statistics (accessed 2023-02-20).
View this document at https://tommi-s.com/Metabolite-Disease-Atlas/ to show all output correctly.
data <-
readr::read_csv(
file = "C:/Users/tsuv0001/Documents/data-nonsensitive-local-copies/ukb_nightingale_biomarker_disease_association_atlas.csv" # Path to the data file downloaded from https://biomarker-atlas.nightingale.cloud/ .
)
## Rows: 1400595 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): age_group, biomarker_id, biomarker_name, group_name, endpoint_type...
## dbl (5): estimate, se, pvalue, n, n_events
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names.biomarkers <- sort( names( table( data$"biomarker_name" ) ) )
names.biomarkers
## [1] "3-Hydroxybutyrate" "Acetate" "Acetoacetate"
## [4] "Acetone" "Alanine" "Albumin"
## [7] "ApoA1" "ApoB" "ApoB/ApoA1"
## [10] "Citrate" "Clinical LDL-C" "Creatinine"
## [13] "DHA" "DHA %" "Glucose"
## [16] "Glutamine" "Glycine" "Glycoprotein acetyls"
## [19] "HDL-C" "HDL-CE" "HDL-FC"
## [22] "HDL-L" "HDL-P" "HDL-PL"
## [25] "HDL-TG" "HDL particle size" "Histidine"
## [28] "IDL-C" "IDL-C %" "IDL-CE"
## [31] "IDL-CE %" "IDL-FC" "IDL-FC %"
## [34] "IDL-L" "IDL-P" "IDL-PL"
## [37] "IDL-PL %" "IDL-TG" "IDL-TG %"
## [40] "Isoleucine" "L-HDL-C" "L-HDL-C %"
## [43] "L-HDL-CE" "L-HDL-CE %" "L-HDL-FC"
## [46] "L-HDL-FC %" "L-HDL-L" "L-HDL-P"
## [49] "L-HDL-PL" "L-HDL-PL %" "L-HDL-TG"
## [52] "L-HDL-TG %" "L-LDL-C" "L-LDL-C %"
## [55] "L-LDL-CE" "L-LDL-CE %" "L-LDL-FC"
## [58] "L-LDL-FC %" "L-LDL-L" "L-LDL-P"
## [61] "L-LDL-PL" "L-LDL-PL %" "L-LDL-TG"
## [64] "L-LDL-TG %" "L-VLDL-C" "L-VLDL-C %"
## [67] "L-VLDL-CE" "L-VLDL-CE %" "L-VLDL-FC"
## [70] "L-VLDL-FC %" "L-VLDL-L" "L-VLDL-P"
## [73] "L-VLDL-PL" "L-VLDL-PL %" "L-VLDL-TG"
## [76] "L-VLDL-TG %" "LA" "LA %"
## [79] "Lactate" "LDL-C" "LDL-CE"
## [82] "LDL-FC" "LDL-L" "LDL-P"
## [85] "LDL-PL" "LDL-TG" "LDL particle size"
## [88] "Leucine" "M-HDL-C" "M-HDL-C %"
## [91] "M-HDL-CE" "M-HDL-CE %" "M-HDL-FC"
## [94] "M-HDL-FC %" "M-HDL-L" "M-HDL-P"
## [97] "M-HDL-PL" "M-HDL-PL %" "M-HDL-TG"
## [100] "M-HDL-TG %" "M-LDL-C" "M-LDL-C %"
## [103] "M-LDL-CE" "M-LDL-CE %" "M-LDL-FC"
## [106] "M-LDL-FC %" "M-LDL-L" "M-LDL-P"
## [109] "M-LDL-PL" "M-LDL-PL %" "M-LDL-TG"
## [112] "M-LDL-TG %" "M-VLDL-C" "M-VLDL-C %"
## [115] "M-VLDL-CE" "M-VLDL-CE %" "M-VLDL-FC"
## [118] "M-VLDL-FC %" "M-VLDL-L" "M-VLDL-P"
## [121] "M-VLDL-PL" "M-VLDL-PL %" "M-VLDL-TG"
## [124] "M-VLDL-TG %" "MUFA" "MUFA %"
## [127] "non-HDL-C" "Omega-3" "Omega-3 %"
## [130] "Omega-6" "Omega-6 %" "Omega-6/Omega-3"
## [133] "Phenylalanine" "Phosphatidylcholines" "Phosphoglycerides"
## [136] "PUFA" "PUFA %" "PUFA/MUFA"
## [139] "Pyruvate" "Remnant-C" "S-HDL-C"
## [142] "S-HDL-C %" "S-HDL-CE" "S-HDL-CE %"
## [145] "S-HDL-FC" "S-HDL-FC %" "S-HDL-L"
## [148] "S-HDL-P" "S-HDL-PL" "S-HDL-PL %"
## [151] "S-HDL-TG" "S-HDL-TG %" "S-LDL-C"
## [154] "S-LDL-C %" "S-LDL-CE" "S-LDL-CE %"
## [157] "S-LDL-FC" "S-LDL-FC %" "S-LDL-L"
## [160] "S-LDL-P" "S-LDL-PL" "S-LDL-PL %"
## [163] "S-LDL-TG" "S-LDL-TG %" "S-VLDL-C"
## [166] "S-VLDL-C %" "S-VLDL-CE" "S-VLDL-CE %"
## [169] "S-VLDL-FC" "S-VLDL-FC %" "S-VLDL-L"
## [172] "S-VLDL-P" "S-VLDL-PL" "S-VLDL-PL %"
## [175] "S-VLDL-TG" "S-VLDL-TG %" "SFA"
## [178] "SFA %" "Sphingomyelins" "TG/PG"
## [181] "Total-C" "Total-CE" "Total-FC"
## [184] "Total-L" "Total-P" "Total-PL"
## [187] "Total BCAA" "Total cholines" "Total fatty acids"
## [190] "Total triglycerides" "Tyrosine" "Unsaturation"
## [193] "Valine" "VLDL-C" "VLDL-CE"
## [196] "VLDL-FC" "VLDL-L" "VLDL-P"
## [199] "VLDL-PL" "VLDL-TG" "VLDL particle size"
## [202] "XL-HDL-C" "XL-HDL-C %" "XL-HDL-CE"
## [205] "XL-HDL-CE %" "XL-HDL-FC" "XL-HDL-FC %"
## [208] "XL-HDL-L" "XL-HDL-P" "XL-HDL-PL"
## [211] "XL-HDL-PL %" "XL-HDL-TG" "XL-HDL-TG %"
## [214] "XL-VLDL-C" "XL-VLDL-C %" "XL-VLDL-CE"
## [217] "XL-VLDL-CE %" "XL-VLDL-FC" "XL-VLDL-FC %"
## [220] "XL-VLDL-L" "XL-VLDL-P" "XL-VLDL-PL"
## [223] "XL-VLDL-PL %" "XL-VLDL-TG" "XL-VLDL-TG %"
## [226] "XS-VLDL-C" "XS-VLDL-C %" "XS-VLDL-CE"
## [229] "XS-VLDL-CE %" "XS-VLDL-FC" "XS-VLDL-FC %"
## [232] "XS-VLDL-L" "XS-VLDL-P" "XS-VLDL-PL"
## [235] "XS-VLDL-PL %" "XS-VLDL-TG" "XS-VLDL-TG %"
## [238] "XXL-VLDL-C" "XXL-VLDL-C %" "XXL-VLDL-CE"
## [241] "XXL-VLDL-CE %" "XXL-VLDL-FC" "XXL-VLDL-FC %"
## [244] "XXL-VLDL-L" "XXL-VLDL-P" "XXL-VLDL-PL"
## [247] "XXL-VLDL-PL %" "XXL-VLDL-TG" "XXL-VLDL-TG %"
names.biomarkers.printed <- names.biomarkers
names.biomarkers.printed <-
names.biomarkers.printed[
!grepl(
x = names.biomarkers.printed,
pattern = "DL"
)
]
names.biomarkers.printed
## [1] "3-Hydroxybutyrate" "Acetate" "Acetoacetate"
## [4] "Acetone" "Alanine" "Albumin"
## [7] "ApoA1" "ApoB" "ApoB/ApoA1"
## [10] "Citrate" "Creatinine" "DHA"
## [13] "DHA %" "Glucose" "Glutamine"
## [16] "Glycine" "Glycoprotein acetyls" "Histidine"
## [19] "Isoleucine" "LA" "LA %"
## [22] "Lactate" "Leucine" "MUFA"
## [25] "MUFA %" "Omega-3" "Omega-3 %"
## [28] "Omega-6" "Omega-6 %" "Omega-6/Omega-3"
## [31] "Phenylalanine" "Phosphatidylcholines" "Phosphoglycerides"
## [34] "PUFA" "PUFA %" "PUFA/MUFA"
## [37] "Pyruvate" "Remnant-C" "SFA"
## [40] "SFA %" "Sphingomyelins" "TG/PG"
## [43] "Total-C" "Total-CE" "Total-FC"
## [46] "Total-L" "Total-P" "Total-PL"
## [49] "Total BCAA" "Total cholines" "Total fatty acids"
## [52] "Total triglycerides" "Tyrosine" "Unsaturation"
## [55] "Valine"
table( data$"age_group" )
##
## 1st tertile (39-53, statin use 6%) 2nd tertile (54-61, statin use 17%)
## 352522 346890
## 3rd tertile (62-71, statin use 30%) Full population
## 343869 357314
table( data$"endpoint_type" )
##
## incident mortality prevalent
## 708025 72697 619873
data.plot <- data
data.plot <- data.plot[ data.plot$"age_group" == "Full population", ]
data.plot <- data.plot[ data.plot$"endpoint_type" == "incident", ]
data.plot <-
data.plot[ data.plot$"biomarker_name" %in% names.biomarkers.printed, ]
data.plot.pca <-
tidyr::pivot_wider(
data = data.plot,
id_cols = icd10_desc,
names_from = biomarker_name,
# values_from = estimate.significant
values_from = estimate
)
tmp <- data.plot.pca$"icd10_desc"
# Extract disease group from disease code (condition).
# Extract most increased and most decreased metabolites for each condition.
data.plot.pca <-
data.frame(
Condition_Name = unlist( data.plot.pca[ , 1 ] ),
Disease_Group =
stringr::str_sub(
string = unlist( data.plot.pca[ , 1 ] ),
start = 1,
end = 1
),
Top_Increase =
colnames( data.plot.pca )[
apply(
X = data.plot.pca[ , -1 ],
MAR = 1,
FUN = which.max
) + 1
],
Top_Decrease =
colnames( data.plot.pca )[
apply(
X = data.plot.pca[ , -1 ],
MAR = 1,
FUN = which.min
) + 1
],
Condition = NA,
data.plot.pca[ , -1 ]
)
# Create a text for tooltip.
data.plot.pca$"Condition" <-
paste0(
data.plot.pca$"Condition_Name",
"\n\nMost Increased: ",
data.plot.pca$"Top_Increase",
"\nMost Decreased: ",
data.plot.pca$"Top_Decrease"
)
# Set disease codes as rownames.
rownames( data.plot.pca ) <- tmp
has.any.missing <-
apply(
X = is.na( data.plot.pca[ , -( 1:5 ) ] ),
MAR = 1,
FUN = any
)
table( has.any.missing )
## has.any.missing
## FALSE TRUE
## 701 13
data.plot.pca <- data.plot.pca[ !has.any.missing, ]
result.pca <- prcomp( data.plot.pca[ , -( 1:5 ) ] )
library( "ggfortify" )
## Loading required package: ggplot2
plot.pca <-
autoplot(
result.pca,
data = data.plot.pca,
alpha = 0.5,
loadings.colour = "black",
loadings.label = TRUE,
loadings.label.colour = "black",
shape = "Condition",
colour = "Disease_Group"
) +
ggplot2::theme( legend.position = "none" )
print( plot.pca )
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 701. Consider
## specifying shapes manually if you must have them.
## Warning: Removed 695 rows containing missing values (geom_point).
plot.pca.interactive <-
plotly::ggplotly(
p = plot.pca,
tooltip = "shape"
)
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 701. Consider
## specifying shapes manually if you must have them.
plot.pca.interactive
data.plot$"Association" <- data.plot$"estimate"
data.plot[ data.plot$"pvalue" > 0.01, "Association" ] <- 0
data.plot.wide <-
tidyr::pivot_wider(
data = data.plot,
id_cols = icd10_desc,
names_from = biomarker_name,
values_from = Association
)
tmp <- data.plot.wide$"icd10_desc"
data.plot.wide <- as.matrix( data.plot.wide[ , -1 ] )
rownames( data.plot.wide ) <- tmp
data.plot.wide[ is.na( data.plot.wide ) ] <- 0
is.all.zeros <-
apply(
X = data.plot.wide == 0,
MAR = 1,
FUN = all
)
table( is.all.zeros )
## is.all.zeros
## FALSE TRUE
## 607 107
conditions.all.zeros <- rownames( data.plot.wide )[ is.all.zeros ]
data.plot.wide <- data.plot.wide[ !is.all.zeros, ]
result.heatmap <-
gplots::heatmap.2(
x = data.plot.wide,
trace = "none",
tracecol = "black",
col = gplots::bluered( n = 99 ),
margins = c( 15, 15 ),
srtCol = 45,
cexCol = 2
)
data.plot.heatmap <- data.plot
data.plot.heatmap <-
data.plot.heatmap[
!( data.plot.heatmap$"icd10_desc" %in% conditions.all.zeros ),
]
data.plot.heatmap$"biomarker_name" <-
factor(
x = data.plot.heatmap$"biomarker_name",
levels = colnames( data.plot.wide )[ result.heatmap$"colInd" ]
)
data.plot.heatmap$"icd10_desc" <-
factor(
x = data.plot.heatmap$"icd10_desc",
levels = rownames( data.plot.wide )[ result.heatmap$"rowInd" ],
labels =
paste(
formatC(
x = 1:nrow( data.plot.wide ),
flag = "0",
width = 3
),
rownames( data.plot.wide )[ result.heatmap$"rowInd" ]
)
)
data.plot.heatmap$"annotation" <-
paste0(
"p: ",
data.plot.heatmap$"pvalue"
)
val.min <- round( min( data.plot.heatmap$"Association" ) * 100 )
val.max <- round( max( data.plot.heatmap$"Association" ) * 100 )
range <- max( abs( val.max ), abs( val.min ) ) * 2
palette <- gplots::bluered( n = range )
palette <- palette[ round( range / 2 + val.min ) : round( range / 2 + val.max ) ]
data.plot.heatmap$"Association"[ data.plot.heatmap$"pvalue" > 0.01 ] <- NA
data.plot.heatmap$"icd10_desc" <-
stringr::str_sub(
string = data.plot.heatmap$"icd10_desc",
start = 1,
end = 30
)
# Create the heatmap.
plot.heatmap.interactive <-
plotly::plot_ly(
data = data.plot.heatmap,
x = ~biomarker_name,
y = ~icd10_desc,
z = ~Association,
text = ~annotation,
colors = gplots::bluered( n = 99 ),
type = "heatmap"
)
# Define the colorscale as symmetric.
plot.heatmap.interactive <-
plotly::colorbar(
p = plot.heatmap.interactive,
limits =
c( -1, 1 ) * max( x = abs( data.plot.heatmap$"Association" ), na.rm = TRUE )
)
# Omit axis titles, adjust text size and omit background grid.
plot.heatmap.interactive <-
plotly::layout(
p = plot.heatmap.interactive,
xaxis =
list(
# categoryorder = "array",
# categoryarray = colnames( data.plot.wide )[ result.heatmap$"colInd"],
showgrid = FALSE,
tickfont = list( size = 7 ),
title = ""
),
yaxis =
list(
# categoryorder = "array",
# categoryarray = rownames( data.plot.wide )[ result.heatmap$"rowInd" ],
showgrid = FALSE,
tickfont = list( size = 7 ),
title = ""
)
)
plot.heatmap.interactive
utils::sessionInfo()
## R version 4.2.0 (2022-04-22 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_Europe.utf8 LC_CTYPE=English_Europe.utf8
## [3] LC_MONETARY=English_Europe.utf8 LC_NUMERIC=C
## [5] LC_TIME=English_Europe.utf8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggfortify_0.4.14 ggplot2_3.3.6
##
## loaded via a namespace (and not attached):
## [1] gtools_3.9.2.1 tidyselect_1.1.2 xfun_0.31 bslib_0.3.1
## [5] purrr_0.3.4 colorspace_2.0-3 vctrs_0.4.1 generics_0.1.2
## [9] viridisLite_0.4.0 htmltools_0.5.2 yaml_2.3.5 utf8_1.2.2
## [13] plotly_4.10.0 rlang_1.0.2 jquerylib_0.1.4 pillar_1.7.0
## [17] glue_1.6.2 withr_2.5.0 bit64_4.0.5 lifecycle_1.0.1
## [21] stringr_1.4.0 munsell_0.5.0 gtable_0.3.0 caTools_1.18.2
## [25] htmlwidgets_1.5.4 evaluate_0.15 labeling_0.4.2 knitr_1.39
## [29] tzdb_0.3.0 fastmap_1.1.0 crosstalk_1.2.0 parallel_4.2.0
## [33] fansi_1.0.3 highr_0.9 KernSmooth_2.23-20 readr_2.1.2
## [37] scales_1.2.0 vroom_1.5.7 jsonlite_1.8.0 farver_2.1.0
## [41] bit_4.0.4 gplots_3.1.3 gridExtra_2.3 hms_1.1.1
## [45] digest_0.6.29 stringi_1.7.6 dplyr_1.0.9 grid_4.2.0
## [49] bitops_1.0-7 cli_3.3.0 tools_4.2.0 magrittr_2.0.3
## [53] sass_0.4.1 lazyeval_0.2.2 tibble_3.1.7 crayon_1.5.1
## [57] tidyr_1.2.0 pkgconfig_2.0.3 ellipsis_0.3.2 data.table_1.14.2
## [61] rmarkdown_2.14 httr_1.4.3 rstudioapi_0.13 R6_2.5.1
## [65] compiler_4.2.0
if ( file.exists( "README.html" ) ) {
file.copy(
from = "README.html",
to = "index.html",
overwrite = TRUE
)
}
## [1] TRUE